Renaming Column Names for easy code handling and Cleaning Data
e <- read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_anthropometric_all.xlsx")
e_normal <- read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_exercise_normal.xlsx")
#Renaming Variables
names(e)[names(e)=='measurement date']<- 'measurement_date'
names(e)[names(e)=='age (years)']<- 'age_years'
names(e)[names(e)=='age bin']<- 'age_bin'
#names(e)[names(e)=='z-category (WHO)']<- 'z_cat_WHO'
names(e)[length(e)]<-"z_cat_WHO"
names(e)[9]<-"z_score_WHO"
names(e)[6]<-"height_cm"
names(e)[7]<-"weight_kg"
#Dealing with NAs
ena<-na.omit(e)
any(is.na(ena$z_cat_WHO))
## [1] FALSE
#Changing Data Types
ena <- ena %>%
mutate(gender = as.factor(gender),
z_cat_WHO=as.factor(z_cat_WHO),
measurement_date=as.Date(measurement_date),
BMI = as.numeric(BMI),
z_score_WHO=as.numeric(z_score_WHO),
height_cm=as.numeric(height_cm),
weight_kg=as.numeric(weight_kg))
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
#Removing "NA" Characters
ena$observation <- 1:nrow(ena)
x<-ena[ena$z_cat_WHO=="NA",]
y<-x$observation
ena<-ena[-y,]
#Adding additional column
ena$year <- year(ena$measurement_date)
Diving data as per Year
ena_2007 <- filter(ena, year(measurement_date) == 2007)
ena_2008 <- filter(ena, year(measurement_date) == 2008)
ena_2009 <- filter(ena, year(measurement_date) == 2009)
ena_2010 <- filter(ena, year(measurement_date) == 2010)
ena_2011 <- filter(ena, year(measurement_date) == 2011)
ena_2012 <- filter(ena, year(measurement_date) == 2012)
ena_2013 <- filter(ena, year(measurement_date) == 2013)
ena_2014 <- filter(ena, year(measurement_date) == 2014)
ena_2015 <- filter(ena, year(measurement_date) == 2015)
ena_2016 <- filter(ena, year(measurement_date) == 2016)
ena_2017 <- filter(ena, year(measurement_date) == 2017)
ena_2018 <- filter(ena, year(measurement_date) == 2018)
Checking uniques IDs whose observation has been taken continuously from 2007 to 2018
a_07_08 <- subset(ena_2007, ID %in% ena_2008$ID)
a_07_09 <- subset(a_07_08, ID %in% ena_2009$ID)
a_07_10 <- subset(a_07_09, ID %in% ena_2010$ID)
a_07_11 <- subset(a_07_10, ID %in% ena_2011$ID)
a_07_12 <- subset(a_07_11, ID %in% ena_2012$ID)
a_07_13 <- subset(a_07_12, ID %in% ena_2013$ID)
a_07_14 <- subset(a_07_13, ID %in% ena_2014$ID)
a_07_15 <- subset(a_07_14, ID %in% ena_2015$ID)
a_07_16 <- subset(a_07_15, ID %in% ena_2016$ID)
a_07_17 <- subset(a_07_16, ID %in% ena_2017$ID)
a_07_18 <- subset(a_07_17, ID %in% ena_2018$ID)
#a_07_18 <- kids data of 2007 whose obeservation were taken till the end of 2018 without any miss.
a_unique_07<- subset(ena, ID %in% a_07_18$ID)
#a_unique_07 = All data of the a_07_18 dataset kids from 2007 to 2018.
#Measumements taken on Oct
Oct_07<-a_unique_07 %>% filter(month(measurement_date) %in% c(10))
by_year_cat_Oct_07<-Oct_07 %>% group_by(year, z_cat_WHO) %>% summarise(count=n())
ggplot(by_year_cat_Oct_07)+
geom_col(aes(x=z_cat_WHO,y=count))+
facet_wrap(~year)
Prop_07_18_Oct<-by_year_cat_Oct_07 %>% group_by(year) %>%
summarise(total=sum(count)) %>%
merge(., by_year_cat_Oct_07, all.y=TRUE) %>%
mutate(Proportion=count*100/total) %>%
select(year, z_cat_WHO, count, Proportion)
#Measumements taken on April
Apr_07<-a_unique_07 %>% filter(month(measurement_date) %in% c(4))
by_year_cat_Apr_07<-Apr_07 %>% group_by(year, z_cat_WHO) %>% summarise(count=n())
ggplot(by_year_cat_Apr_07)+
geom_col(aes(x=z_cat_WHO,y=count))+
facet_wrap(~year)
Prop_07_18_Apr<-by_year_cat_Apr_07 %>% group_by(year) %>%
summarise(total=sum(count)) %>%
merge(., by_year_cat_Apr_07, all.y=TRUE) %>%
mutate(Proportion=count*100/total) %>%
select(year, z_cat_WHO, count, Proportion)
#Checking the number of age counts in 2007
age_check_07 <- a_07_18 %>% group_by(age_bin, year) %>% summarise(count=n())
#Checking height
#Checking the Shortest and Tallest in 2007 and check throughout growth
#Checking 5 shortest heights in 2007 (boy and girl)
shortest_07_boy <- a_07_18 %>% arrange(height_cm) %>% filter((gender) %in% c("boy"))%>% head(5)
shortest_07_boy_IDs<-shortest_07_boy$ID
shortest_07_girl <- a_07_18 %>% arrange(height_cm) %>% filter((gender) %in% c("girl")) %>% head(5)
shortest_07_girl_IDs<-shortest_07_girl$ID
#Checking prop increase per year
a_unique_07_pct<-a_unique_07 %>%
group_by(ID) %>%
arrange(measurement_date, .by_group = TRUE) %>%
mutate(pct_change = (height_cm/lag(height_cm) - 1) * 100) %>%
select(ID, measurement_date, height_cm, pct_change)
#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
shortest_boy<-a_unique_07_pct %>% filter((ID) %in% c(shortest_07_boy_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(shortest_boy, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#GIRL
shortest_girl<-a_unique_07_pct %>% filter((ID) %in% c(shortest_07_girl_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(shortest_girl, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#Negative growth rate check (Wrong Data)
shortest_girl%>% filter((ID)%in%c(262))
## # A tibble: 21 x 4
## # Groups: ID [1]
## ID measurement_date height_cm pct_change
## <dbl> <date> <dbl> <dbl>
## 1 262 2007-10-01 116 NA
## 2 262 2008-04-01 120 3.45
## 3 262 2008-10-01 122 1.67
## 4 262 2009-04-01 125 2.46
## 5 262 2009-10-01 127 1.6
## 6 262 2010-04-01 131 3.15
## 7 262 2010-10-01 135 3.05
## 8 262 2011-04-01 137 1.48
## 9 262 2011-10-01 138 0.730
## 10 262 2012-04-01 141 2.17
## # … with 11 more rows
#Checking 5 tallest heights in 2007 (boy and girl)
tallest_07_boy <- a_07_18 %>% arrange(desc(height_cm)) %>% filter((gender) %in% c("boy"))%>% head(5)
tallest_07_boy_IDs<-tallest_07_boy$ID
tallest_07_girl <- a_07_18 %>% arrange(desc(height_cm)) %>% filter((gender) %in% c("girl")) %>% head(5)
tallest_07_girl_IDs<-tallest_07_girl$ID
#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
tallest_boy<-a_unique_07_pct %>% filter((ID) %in% c(tallest_07_boy_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(tallest_boy, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#GIRL
tallest_girl<-a_unique_07_pct %>% filter((ID) %in% c(tallest_07_girl_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(tallest_girl, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#Negative growth rate check (Wrong Data)
tallest_girl%>% filter((ID)%in%c(340))
## # A tibble: 20 x 4
## # Groups: ID [1]
## ID measurement_date height_cm pct_change
## <dbl> <date> <dbl> <dbl>
## 1 340 2007-10-01 137 NA
## 2 340 2008-04-01 140 2.19
## 3 340 2009-04-01 146 4.29
## 4 340 2009-10-01 149 2.05
## 5 340 2010-04-01 154 3.36
## 6 340 2010-10-01 156 1.30
## 7 340 2011-04-01 161 3.21
## 8 340 2011-10-01 162 0.621
## 9 340 2012-04-01 163 0.617
## 10 340 2012-10-01 164 0.613
## 11 340 2013-04-01 165 0.610
## 12 340 2013-10-01 165 0
## 13 340 2014-04-01 166 0.606
## 14 340 2014-10-01 165 -0.602
## 15 340 2015-04-01 168 1.82
## 16 340 2016-04-01 168 0
## 17 340 2016-10-01 167 -0.595
## 18 340 2017-04-01 177 5.99
## 19 340 2017-10-01 167 -5.65
## 20 340 2018-04-01 167 0
#Checking the height in 2018 and investigating their growth from 2007 to 2018.
#Checking the Shortest and Tallest in 2018 and check throughout growth
#Finding IDs of the shortest kids on 2018
shortest_boy_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>%
arrange(height_cm) %>% filter((gender) %in% c("boy")) %>% head(5)
shortest_boy_18_IDs <-shortest_boy_18$ID
shortest_girl_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>%
arrange(height_cm) %>% filter((gender) %in% c("girl")) %>% head(5)
shortest_girl_18_IDs <-shortest_girl_18$ID
#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
shortest_boy_all<-a_unique_07_pct %>% filter((ID) %in% c(shortest_boy_18_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(shortest_boy_all, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#GIRL
shortest_girl_all<-a_unique_07_pct %>% filter((ID) %in% c(shortest_girl_18_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(shortest_girl_all, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#Negative growth rate check (Wrong Data)
shortest_girl_all%>% filter((ID)%in%c(262))
## # A tibble: 21 x 4
## # Groups: ID [1]
## ID measurement_date height_cm pct_change
## <dbl> <date> <dbl> <dbl>
## 1 262 2007-10-01 116 NA
## 2 262 2008-04-01 120 3.45
## 3 262 2008-10-01 122 1.67
## 4 262 2009-04-01 125 2.46
## 5 262 2009-10-01 127 1.6
## 6 262 2010-04-01 131 3.15
## 7 262 2010-10-01 135 3.05
## 8 262 2011-04-01 137 1.48
## 9 262 2011-10-01 138 0.730
## 10 262 2012-04-01 141 2.17
## # … with 11 more rows
#Checking the height in 2018 and investigating their growth from 2007 to 2018.
#Finding IDs of the tallest kids on 2018
tallest_boy_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>%
arrange(desc(height_cm)) %>% filter((gender) %in% c("boy")) %>% head(5)
tallest_boy_18_IDs <-tallest_boy_18$ID
tallest_girl_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>%
arrange(desc(height_cm)) %>% filter((gender) %in% c("girl")) %>% head(5)
tallest_girl_18_IDs <-tallest_girl_18$ID
#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
tallest_boy_all<-a_unique_07_pct %>% filter((ID) %in% c(tallest_boy_18_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(tallest_boy_all, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#GIRL
tallest_girl_all<-a_unique_07_pct %>% filter((ID) %in% c(tallest_girl_18_IDs)) %>%
select(ID, measurement_date, height_cm, pct_change)
ggplot(tallest_girl_all, aes(x=measurement_date, y = pct_change ))+
geom_line()+facet_wrap(~ID)+
geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).
#Negative growth rate check (Wrong Data)
tallest_girl_all%>% filter((ID)%in%c(898))
## # A tibble: 22 x 4
## # Groups: ID [1]
## ID measurement_date height_cm pct_change
## <dbl> <date> <dbl> <dbl>
## 1 898 2007-10-01 142 NA
## 2 898 2008-04-01 148 4.23
## 3 898 2008-10-01 148 0
## 4 898 2009-04-01 152 2.70
## 5 898 2009-10-01 155 1.97
## 6 898 2010-04-01 160 3.23
## 7 898 2010-10-01 163 1.88
## 8 898 2011-04-01 169 3.68
## 9 898 2011-10-01 171 1.18
## 10 898 2012-04-01 171 0
## # … with 12 more rows
#Checking growth of all boys and girls combined
mod <- sitar(x = age_years, y = height_cm, id = ID, data = a_unique_07, df = 5)
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
#Growth curve for all IDs
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07, col = ID, las = 1)
plot(mod, opt = 'a', col = ID, las = 1, xlim = xaxsd(), ylim = yaxsd())
#Growth curve for all IDs by boys and girls
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07, col = gender, las = 1)
plot(mod, opt = 'a', col = gender, las = 1, xlim = xaxsd(), ylim = yaxsd())
legend("bottomleft", legend=c("Black -> Boys", "Red -> Girls"), col=c("red", "blue"))
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod, opt = 'd', las = 1, apv = TRUE)
## apv pv
## 12.16 8.35
plot(mod, opt = 'v', las = 1, apv = TRUE, lty = 2)
## apv pv
## 12.16 8.35
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod, opt = 'u', las = 1, col = 8, lwd = 0.5)
lines(mod, opt = 'd', lty = 2)
lines(mod, opt = 'ua', col = 4, subset = ID == 312)
lines(mod, opt = 'ua', col = 2, subset = ID == 277)
lines(mod, opt = 'ua', col = 6, subset = ID == 299)
lines(mod, opt = 'ua', col = 3, subset = ID == 310)
lines(mod, opt = 'ua', col = 1, subset = ID == 3387)
legend('bottomright', c('ID 312', 'ID 277','ID 299','ID 310','ID 3387','mean'),lty = c(1,1,1,1,1,2), col = c(4,2,6,3,8), cex = 0.8, inset=0.04)
pairs(ranef(mod), labels = c('size', 'timing', 'intensity'), pch=20)
#Dividing a_07_18 : boys and girls and show the growth curve : making 2 datasets from a_unique_07 (boys and girls) and fiting sitar
only_boy_07_18<-a_07_18 %>% filter((gender) %in% c('boy'))
boy_ID<-only_boy_07_18$ID
only_girl_07_18<-a_07_18 %>% filter((gender) %in% c('girl'))
girl_ID<-only_girl_07_18$ID
a_unique_07_boy<-a_unique_07 %>% filter((ID) %in% c(boy_ID))
a_unique_07_girl<-a_unique_07 %>% filter((ID) %in% c(girl_ID))
#Checking growth of only boys
mod_boy <- sitar(x = age_years, y = height_cm, id = ID, data = a_unique_07_boy, df = 5)
## Warning in nlme.formula(y ~ fitnlme(x, s1, s2, s3, s4, s5, a, b, c), fixed = s1
## + : Iteration 3, LME step: nlminb() did not converge (code = 1). PORT message:
## false convergence (8)
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
#Growth curve for all boys IDs
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07_boy, col = ID, las = 1)
plot(mod_boy, opt = 'a', col = ID, las = 1, xlim = xaxsd(), ylim = yaxsd())
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_boy, opt = 'd', las = 1, apv = TRUE)
## apv pv
## 13.340 9.757
plot(mod_boy, opt = 'v', las = 1, apv = TRUE, lty = 2)
## apv pv
## 13.340 9.757
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_boy, opt = 'u', las = 1, col = 8, lwd = 0.5)
lines(mod_boy, opt = 'd', lty = 2)
pairs(ranef(mod_boy), labels = c('size', 'timing', 'intensity'), pch=20)
#Checking growth of only girls
mod_girl <- sitar(x = age_years, y = height_cm, id = ID, data = a_unique_07_girl, df = 5)
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
#Growth curve for all boys IDs
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07_girl, col = ID, las = 1)
plot(mod_girl, opt = 'a', col = ID, las = 1, xlim = xaxsd(), ylim = yaxsd())
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_girl, opt = 'd', las = 1, apv = TRUE)
## apv pv
## 11.380 7.303
plot(mod_girl, opt = 'v', las = 1, apv = TRUE, lty = 2)
## apv pv
## 11.380 7.303
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_girl, opt = 'u', las = 1, col = 8, lwd = 0.5)
lines(mod_girl, opt = 'd', lty = 2)
pairs(ranef(mod_girl), labels = c('size', 'timing', 'intensity'), pch=20)
e_normal<-read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_exercise_normal.xlsx", na="NA")
ena_normal <- na.omit(e_normal)
nrow(ena_normal)
## [1] 40518
colnames(ena_normal)
## [1] "ID" "measurement date" "age (years)"
## [4] "age bin" "gender" "height (cm)"
## [7] "weight (kg)" "BMI" "z-score (WHO)"
## [10] "z-category (WHO)" "running distance (m)" "running time (s)"
## [13] "running speed (m/s)" "pulse 0'" "pulse 1'"
## [16] "pulse 5'" "pulse 10'" "SBP 0'"
## [19] "SBP 1'" "SBP 5'" "SBP 10'"
## [22] "DBP 0'" "DBP 1'" "DBP 5'"
## [25] "DBP 10'"
#And Compare both shortest and tallest
#check weight and BMI and heartrate as well